import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder
# Global matplotlib font sizes so every subsequent plot is readable
# at the large figure sizes used later in this notebook.
plt.rcParams['axes.labelsize'] = 20
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['xtick.labelsize'] = 18
plt.rcParams['ytick.labelsize'] = 18
plt.rcParams['legend.fontsize'] = 14
# Load a previously processed sample for reference (Python 2 print syntax).
dataOld = pd.read_csv('Processed_Data_part.csv')
print len(dataOld)
dataOld.head()
# Main dataset: one row per mine-accident record.
data = pd.read_csv('mine_accidents.csv')
print len(data)
data.head()
# Drop columns not used as features: occupation code and the two
# narrative fields (presumably free text — narrtxt1/narrtxt2).
data.drop(['occup'], axis=1, inplace=True)
data.head()
data.drop(['narrtxt1', 'narrtxt2'], axis=1, inplace=True)
data.head()
# Target variable: accident_type (multiclass).
y = data['accident_type'].unique()
print len(y)
y
data["accident_type"].value_counts()
# NOTE(review): a __future__ import must normally be the first statement of
# a module; this only works because the code originally ran as a separate
# notebook cell.  It makes `/` true division under Python 2.
from __future__ import division
# contractor: report how many values are present, then fill NaN with a
# 'same_mine' sentinel (presumably NaN means no external contractor —
# TODO confirm against the data dictionary).
print 'Size without nan: ', len(data.contractor.dropna())
print 'Size Total: ', len(data.contractor)
print 'Percentage: ', (len(data.contractor.dropna())/len(data.contractor))*100
data.contractor.fillna('same_mine', inplace=True)
data.head()
# underground_method: impute missing values with 'Continuous' (presumably
# the most frequent category, per the value_counts just below).
print 'Size without nan: ', len(data.underground_method.dropna())
print 'Size Total: ', len(data.underground_method)
print 'Percentage: ', (len(data.underground_method.dropna())/len(data.underground_method))*100
data.underground_method.value_counts()
data.underground_method.fillna('Continuous', inplace=True)
data.underground_method.head()
data.equipment_model.head()
# equipment_model: inspected, then dropped entirely rather than imputed.
print 'Size without nan: ', len(data.equipment_model.dropna())
print 'Size Total: ', len(data.equipment_model)
print 'Percentage: ', (len(data.equipment_model.dropna())/len(data.equipment_model))*100
data.equipment_model.value_counts().head()
data.drop(['equipment_model'], axis=1, inplace=True)
data.head(5)
# Finally, drop any remaining rows containing NaN in any column.
data.dropna(inplace=True)
print len(data)
data.head()
from collections import defaultdict
# One LabelEncoder per categorical column, keyed by column name; kept at
# module level so codes can be decoded later with inverse_transform.
encodersDict = defaultdict(LabelEncoder)
def categoricalColumns(df):
    """Return the names of the non-numeric (categorical) columns of *df*.

    The result is unordered (built via set difference), matching the
    original behavior; callers only iterate over it.
    """
    # Bug fix (perf): the original copied the entire DataFrame with
    # df.copy() just to read column names — the copy is unnecessary.
    cols = df.columns
    # _get_numeric_data() selects the numeric subset of the frame.
    cols_numeric = df._get_numeric_data().columns
    return list(set(cols) - set(cols_numeric))
def categoricalToNumeric(df):
    """Label-encode every categorical column of *df*.

    Returns a copy of *df* in which each non-numeric column has been
    replaced by integer codes.  The fitted LabelEncoder for each column is
    stored in the module-level ``encodersDict`` (keyed by column name) so
    the codes can later be reversed with ``inverse_transform``.
    """
    df = df.copy()
    cat_columns = categoricalColumns(df)
    print('Categorical columns: ', cat_columns)
    print('Size columns: ', len(cat_columns))
    if cat_columns:
        for category in cat_columns:
            # Reuse (or lazily create) the per-column encoder; for a
            # Series, .name == the column label, so indexing the dict by
            # `category` is equivalent to the original `x.name` lookup.
            df.loc[:, category] = encodersDict[category].fit_transform(df[category])
    return df
Todos los objetos LabelEncoder se encuentran en el diccionario encodersDict, para su posterior uso.
# Encode every categorical column; dataNew is fully numeric afterwards.
dataNew = categoricalToNumeric(data)
dataNew.head()
def modelfit(alg, dtrain, predictors, performCV=True, printFeatureImportance=True, cv_folds=5, target='accident_type'):
    """Fit *alg* and (optionally) plot and return its feature importances.

    Parameters
    ----------
    alg : estimator exposing fit() and feature_importances_ (tree ensemble).
    dtrain : DataFrame containing both the predictor columns and the target.
    predictors : list of predictor column names.
    performCV, cv_folds : accepted for backward compatibility only; the
        cross-validation step was already disabled (commented out) in the
        original code.
    printFeatureImportance : when True, bar-plot the importances.
    target : target column name; the default preserves the original
        hard-coded 'accident_type' behavior.

    Returns
    -------
    pandas.Series of feature importances sorted descending when
    printFeatureImportance is True, otherwise None.
    """
    # Fit the algorithm on the full training data.
    alg.fit(dtrain[predictors], dtrain[target])

    # NOTE(review): the original also computed alg.predict(...) and
    # alg.predict_proba(...)[:, 1] here but never used either result (and
    # column 1 of predict_proba is meaningless for this multiclass target),
    # so those dead computations were removed.

    # Bug fix: the original raised NameError on `return feat_imp` when
    # printFeatureImportance was False, because feat_imp was never bound.
    feat_imp = None
    if printFeatureImportance:
        feat_imp = pd.Series(alg.feature_importances_, predictors).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances', figsize=(14., 7.))
        plt.ylabel('Feature Importance Score')
        plt.show()
        plt.close()
    return feat_imp
%%time
# Feature-importance ranking with a randomized tree ensemble.
target = 'accident_type'
#IDcol = 'mineid'
predictors = [x for x in dataNew if x not in [target]]
model = ExtraTreesClassifier(random_state=10)
fi = modelfit(model, dataNew, predictors)
print fi.head(6)
%%time
# Same ranking with gradient boosting, for comparison.
target = 'accident_type'
#IDcol = 'mineid'
predictors = [x for x in dataNew if x not in [target]]
model = GradientBoostingClassifier(random_state=10)
fi = modelfit(model, dataNew, predictors)
print fi.head(6)
Se observa que las variables accident_injury_illness, nature_injury y source_injury son las que mayor score obtienen; esto significa que estas variables tienen una mayor relevancia para la variable de respuesta accident_type y, por ello, aportarán en gran medida al performance del modelo.
accident_injury_illness: Powered haulage, Slip or fall of person (from an elevation or on the same level), Handling material, etc.
nature_injury : Sprain, strains; Burn or scald (heat), etc.
source_injury : Explosives, Flame, fire, smoke NEC, Surface mining machines, etc.
%%time
# Histogram of every (now numeric) column in a 10x6 grid.
dataNew.hist(figsize=(36.,36.), layout=(10,6))
plt.show()
plt.close()
dataNew.accident_injury_illness.value_counts()
# Decode the most frequent integer codes back to their original labels.
encodersDict['accident_injury_illness'].inverse_transform([12,15,8,7,16])
La mayor cantidad de accidentes ocurre en estas clases de actividades
dataNew.nature_injury.value_counts()
# Decode the most frequent nature_injury codes back to text labels.
encodersDict['nature_injury'].inverse_transform([19,11,6,14,4])
from collections import OrderedDict
# Top-20 most important features (in importance order) from the last
# modelfit run above.
dictFI = OrderedDict(fi.head(20))
dictFI
%%time
# NOTE(review): .keys() returning a list that supports .append() is
# Python 2 behavior; under Python 3 this would need list(dictFI.keys()).
features = dictFI.keys()
features.append('accident_type')
def plot_heatmap(df):
    """Draw an annotated seaborn heatmap of *df* (e.g. a correlation matrix)."""
    fig, axes = plt.subplots(figsize=(20, 20))
    # Bug fix (robustness): draw explicitly on the created axes; the
    # original never used `fig`/`axes` and relied on subplots() implicitly
    # making them current.
    sns.heatmap(df, annot=True, ax=axes)
    plt.show()
    plt.close()
# Pearson correlation heatmap over the top features plus the target.
plot_heatmap(dataNew[features].corr(method='pearson'))
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Standardize (zero mean, unit variance) every ranked feature column.
dataScaled = dataNew.copy()
scaler = StandardScaler()
dictFI2 = OrderedDict(fi)
dictFI2 = dictFI2.keys()
dataScaled[dictFI2] = scaler.fit_transform(dataScaled[dictFI2])
dataScaled.head()
dataNew.head(10)
# Pairwise scatter plots of the 5 most important features, colored by class.
# (Python 2: .keys() returns a sliceable list.)
scatter = dictFI.keys()[:5]
scatter.append('accident_type')
scatter
%%time
sns.pairplot(dataNew[scatter], hue="accident_type", size=3)
plt.show()
%%time
# NOTE(review): pandas.tools.plotting moved to pandas.plotting in
# pandas 0.20+; these imports only work on older pandas versions.
from pandas.tools.plotting import scatter_matrix
scatter_matrix(dataNew[scatter], alpha=0.2, figsize=(20, 20), diagonal='kde')
%%time
# Andrews curves over the same 5 features, one curve per row.
from pandas.tools.plotting import andrews_curves
andrew = dictFI.keys()[:5]
andrew.append('accident_type')
plt.figure(figsize=(20,20))
andrews_curves(dataNew[andrew], 'accident_type')
plt.show()
%%time
# Parallel-coordinates plot of the same 5 features, grouped by class.
from pandas.tools.plotting import parallel_coordinates
parallel = dictFI.keys()[:5]
parallel.append('accident_type')
plt.figure(figsize=(20,20))
parallel_coordinates(dataNew[parallel], 'accident_type')
plt.show()
Se observa que, al plotear las variables agrupadas con la variable accident_type, los puntos y líneas no se logran diferenciar muy bien; esto es porque se están prediciendo 42 clases. Se podría reducir la cantidad de clases reclasificando los tipos de accidentes, es decir, los menos frecuentes se pondrían 'in the same bag'; con esto se lograría una delineación de fronteras más visible (logistic, pca, svm, decision tree).
# box = dictFI.keys()[:6]
# box.append('accident_type')
# dataNew[box].plot(kind="box", figsize=(16.,16.))
# plt.xticks(rotation='vertical')
# %%time
# import define
# import analyze
# import prepare
# import feature_selection
# import evaluate
# from sklearn.pipeline import Pipeline, FeatureUnion
# from sklearn.svm import SVC
# from sklearn import cross_validation
# import pandas as pd
# #name = "datasets/iris.csv"
# name = "datasets/Processed_Data_part.csv"
# #name = "datasets/LocalizationOld.csv"
# #name = "datasets/seguridad.csv"
# #name = "datasets/breast-cancer-wisconsin.csv"
# #name = "breast-cancer-wisconsin.csv"
# #name = "inputBus.csv"
# # className = "Ruta"
# #className = "CATEGORY"
# #className = "class"
# className = "position"
# #STEP 0: Define workflow parameters
# definer = define.Define(nameData=name, className=className).pipeline()
# #STEP 1: Analyze data by ploting it
# #analyze.Analyze(definer).pipeline()
# #STEP 2: Prepare data by scaling, normalizing, etc.
# preparer = prepare.Prepare(definer).pipeline()
# #STEP 3: Feature selection
# featurer = feature_selection.FeatureSelection(definer).pipeline()
# #STEP4: Evalute the algorithms by using the pipelines
# evaluator = evaluate.Evaluate(definer, preparer, featurer).pipeline()